Using the kiln pressure data we can generate features to aid in concreteley determining whether certain patterns produce better yields. Some of the features discussed include:
Area under the curve was originally going to be calculated but after some thought, time spent within the pressure ranges seemed a more useful metric. Could potentially revisit an AUC calculation later on.
press_calcs <- kilns_All %>%
group_by(LOTNO) %>%
mutate(
# adust pressure max and min
pressure = case_when(pressure > .07 ~ .07,
pressure < -.07 ~ -.07,
TRUE ~ pressure
),
# find time index where temp reaches 1200F
close_1200 = if_else( (time < index_max_temp), (abs(1200 - avg_kiln_temp)), NULL ))
# join time index to original
press_calcs <- left_join(press_calcs, press_calcs %>%
group_by(LOTNO) %>%
dplyr::summarise(index_1200F = which.min(close_1200))
)
press_calcs <- press_calcs %>%
group_by(LOTNO) %>%
mutate(
# segregate pressures into positive and negative columns
press_pos = if_else((pressure >= 0) & (time <= index_1200F), pressure, NULL),
press_neg = if_else((pressure < 0) & (time <= index_1200F), pressure, NULL),
# get all pressures below 1200F point for easy mean, sd calcs
press_1200F = if_else(time <= index_1200F, pressure, NULL),
# segregate pressures into range columns
press_greater_03 = if_else((pressure >= .03) & (time <= index_1200F), pressure, NULL),
press_betw_02_03 = if_else((pressure >= .02) & (pressure < .03) & (time <= index_1200F), pressure, NULL),
press_betw_01_02 = if_else((pressure >= .01) & (pressure < .02) & (time <= index_1200F), pressure, NULL),
press_betw_00_01 = if_else((pressure >= .0) & (pressure < .01) & (time <= index_1200F), pressure, NULL),
press_less_00 = if_else((pressure < 0) & (time <= index_1200F), pressure, NULL),
# add counter columns for easy aggregation of total times spent in range
press_pos_count = if_else(!is.na(press_pos),1,0),
press_neg_count = if_else(!is.na(press_neg),1,0),
press_greater_03_count = if_else(!is.na(press_greater_03),1,0),
press_betw_02_03_count = if_else(!is.na(press_betw_02_03),1,0),
press_betw_01_02_count = if_else(!is.na(press_betw_01_02),1,0),
press_betw_00_01_count = if_else(!is.na(press_betw_00_01),1,0),
press_less_00_count = if_else(!is.na(press_less_00),1,0)
) %>%
# remove helper columns
dplyr::select(-c(close_1200)) %>%
dplyr::select(c(LOTNO,index_max_temp,index_1200F,everything()))
# summarise times at pressures
press_summ <- press_calcs %>%
group_by(LOTNO) %>%
dplyr::summarise(
# sum times at negative and positive pressure
time_at_pos = sum(press_pos_count, na.rm=TRUE),
time_at_neg = sum(press_neg_count, na.rm=TRUE),
# get means and sd of pressures
mean_press = mean(press_1200F, na.rm=TRUE),
sd_press = sd(press_1200F, na.rm=TRUE),
# sum times at pressure ranges
time_greater_03 = sum(press_greater_03_count, na.rm=TRUE),
time_betw_02_03 = sum(press_betw_02_03_count, na.rm=TRUE),
time_betw_01_02 = sum(press_betw_01_02_count, na.rm=TRUE),
time_betw_00_01 = sum(press_betw_00_01_count, na.rm=TRUE),
time_less_00 = sum(press_less_00_count, na.rm=TRUE)
)
# join summarised features to original pressure data for easy plotting and verification
press_joined <- left_join(press_calcs, press_summ) %>%
mutate(
pct_time_at_pos = time_at_pos/index_1200F,
pct_time_at_neg = time_at_neg/index_1200F,
pct_time_greater_03 = time_greater_03/index_1200F,
pct_time_betw_02_03 = time_betw_02_03/index_1200F,
pct_time_betw_01_02 = time_betw_01_02/index_1200F,
pct_time_betw_00_01 = time_betw_00_01/index_1200F,
pct_time_less_00 = time_less_00/index_1200F
) %>%
# remove helper columns
dplyr::select(-c(press_pos_count,
press_neg_count,
press_greater_03_count,
press_betw_02_03_count,
press_betw_01_02_count,
press_betw_00_01_count,
press_less_00_count,
press_1200F
))
# attached aucDiff to each df
press_summ <- left_join(press_summ, allAucValues)
press_joined <- left_join(press_joined, allAucValues)
# add kiln column for quick filtering
press_summ <- press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]")))
# add weather data as well..
press_summ <- df_yields %>%
group_by(LOTNO) %>% slice(1) %>% ungroup() %>%
dplyr::select(LOTNO, precip:temp_avg) %>%
right_join(press_summ)
# prepare lot yield metric
# get lot yields by summarising fired and rejects
lot_yields <-
df_yields %>%
group_by(LOTNO) %>%
dplyr::summarise(
lot_items_fired = sum(TOTAL_ITEM_FIRED),
lot_items_rejected = sum(TOTAL_ITEM_REJECTED),
lot_yield = (lot_items_fired - lot_items_rejected) / lot_items_fired
) %>%
dplyr::select(LOTNO, lot_yield)
# join yields with features, remove 20 missing lot yield values
press_summ <- left_join(press_summ, lot_yields) %>%
na.omit()
#
# significance function
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
Quick visualization of randomly selected lots to verify that metrics seem to be produced correctly.
# sample random lots
set.seed(123)
sampleC <- sample(levels(kilns_C$LOTNO),1)
sampleD <- sample(levels(kilns_D$LOTNO),1)
sampleE <- sample(levels(kilns_E$LOTNO),1)
sampleF <- sample(levels(kilns_F$LOTNO),1)
sampleG <- sample(levels(kilns_G$LOTNO),2)
sampleH <- sample(levels(kilns_H$LOTNO),2)
sample <- c(sampleC, sampleD, sampleE, sampleF)
# scale and shift values for second y-axis
sec_y_scale=20000
sec_y_shift=1500
# plot
press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
# KILN TEMP, SETPOINT, PRESSURE
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# PRESSURE POINTS
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
# HLINES
geom_hline(aes(yintercept = .0 * sec_y_scale + sec_y_shift),color='red')+
geom_hline(aes(yintercept = .01 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
geom_hline(aes(yintercept = .02 * sec_y_scale + sec_y_shift),color='green',linetype='dashed')+
geom_hline(aes(yintercept = .03 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
# mean/sd
# geom_hline(aes(yintercept = mean_press * sec_y_scale + sec_y_shift), color = 'blue',linetype='dotdash',size=1)+
# geom_hline(aes(yintercept = (mean_press+sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# geom_hline(aes(yintercept = (mean_press-sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# PRESSURE RIBBONS
# geom_ribbon(aes(ymin = press_less_00 * sec_y_scale + sec_y_shift, ymax = 0 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# geom_ribbon(aes(ymin = 0.00 * sec_y_scale + sec_y_shift, ymax = press_betw_00_01 * sec_y_scale +sec_y_shift),fill='yellow3',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_betw_01_03 * sec_y_scale + sec_y_shift),fill='green',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_greater_03 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# zoom on ribbons
# scale_x_continuous(limits = c(0,18.4*60))+
# scale_y_continuous(limits = c(1250,2350))
# setpoint, temp AUC ribbon
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# labels
# AUC
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(aucDiff),
aes(x = 60 * 30, y = -0.065 * sec_y_scale + sec_y_shift,
label = paste0( "AUC: ", round(aucDiff,0) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='pink',label.size=.1
) +
# MEAN
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(mean_press),
aes(x = 60 * 30, y = -0.02 * sec_y_scale + sec_y_shift,
label = paste0( "mean: ", round(mean_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue',label.size=.1
) +
# SD
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(sd_press),
aes(x = 60 * 30, y = -0.03 * sec_y_scale + sec_y_shift,
label = paste0( "sd: ", round(sd_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue2',label.size=.1
) +
# POS PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_pos, pct_time_at_pos, index_1200F),
aes(x = 60 * 72, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_at_pos, "m, ",
mypercent(pct_time_at_pos))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey80',label.size=.1
) +
# NEG PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_neg, pct_time_at_neg, index_1200F),
aes(x = 60 * 72, y = -0.06 * sec_y_scale + sec_y_shift,
label = paste0( time_at_neg, "m, ",
mypercent(pct_time_at_neg))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey90'
)+
# LESS 00
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_less_00, pct_time_less_00, index_1200F),
aes(x = 60 * 72, y = -.02 * sec_y_scale + sec_y_shift,
label = paste0( time_less_00, "m, ",
mypercent(pct_time_less_00))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
)+
# BETW 00 01
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_00_01, pct_time_betw_00_01, index_1200F),
aes(x = 60 * 72, y = 0.0 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_00_01, "m, ",
mypercent(pct_time_betw_00_01))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='yellow3'
) +
# BETW 01 02
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_01_02, pct_time_betw_01_02, index_1200F),
aes(x = 60 * 72, y = .015 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_01_02, "m, ",
mypercent(pct_time_betw_01_02))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green'
) +
# BETW 02 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_02_03, pct_time_betw_02_03, index_1200F),
aes(x = 60 * 72, y = .025 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_02_03, "m, ",
mypercent(pct_time_betw_02_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green3'
) +
# GREATER 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_greater_03, pct_time_greater_03, index_1200F),
aes(x = 60 * 72, y = 0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_greater_03, "m, ",
mypercent(pct_time_greater_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
) +
facet_wrap(~LOTNO)
# sample random lots
set.seed(8634)
sample <- sample(levels(kilns_All$LOTNO),4)
# plot
press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
# KILN TEMP, SETPOINT, PRESSURE
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# PRESSURE POINTS
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
# HLINES
geom_hline(aes(yintercept = .0 * sec_y_scale + sec_y_shift),color='red')+
geom_hline(aes(yintercept = .01 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
geom_hline(aes(yintercept = .02 * sec_y_scale + sec_y_shift),color='green',linetype='dashed')+
geom_hline(aes(yintercept = .03 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
# mean/sd
# geom_hline(aes(yintercept = mean_press * sec_y_scale + sec_y_shift), color = 'blue',linetype='dotdash',size=1)+
# geom_hline(aes(yintercept = (mean_press+sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# geom_hline(aes(yintercept = (mean_press-sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# PRESSURE RIBBONS
# geom_ribbon(aes(ymin = press_less_00 * sec_y_scale + sec_y_shift, ymax = 0 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# geom_ribbon(aes(ymin = 0.00 * sec_y_scale + sec_y_shift, ymax = press_betw_00_01 * sec_y_scale +sec_y_shift),fill='yellow3',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_betw_01_03 * sec_y_scale + sec_y_shift),fill='green',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_greater_03 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# zoom on ribbons
# scale_x_continuous(limits = c(0,18.4*60))+
# scale_y_continuous(limits = c(1250,2350))
# setpoint, temp AUC ribbon
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# labels
# AUC
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(aucDiff),
aes(x = 60 * 30, y = -0.065 * sec_y_scale + sec_y_shift,
label = paste0( "AUC: ", round(aucDiff,0) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='pink',label.size=.1
) +
# MEAN
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(mean_press),
aes(x = 60 * 30, y = -0.02 * sec_y_scale + sec_y_shift,
label = paste0( "mean: ", round(mean_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue',label.size=.1
) +
# SD
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(sd_press),
aes(x = 60 * 30, y = -0.03 * sec_y_scale + sec_y_shift,
label = paste0( "sd: ", round(sd_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue2',label.size=.1
) +
# POS PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_pos, pct_time_at_pos, index_1200F),
aes(x = 60 * 72, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_at_pos, "m, ",
mypercent(pct_time_at_pos))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey80',label.size=.1
) +
# NEG PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_neg, pct_time_at_neg, index_1200F),
aes(x = 60 * 72, y = -0.06 * sec_y_scale + sec_y_shift,
label = paste0( time_at_neg, "m, ",
mypercent(pct_time_at_neg))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey90'
)+
# LESS 00
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_less_00, pct_time_less_00, index_1200F),
aes(x = 60 * 72, y = -.02 * sec_y_scale + sec_y_shift,
label = paste0( time_less_00, "m, ",
mypercent(pct_time_less_00))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
)+
# BETW 00 01
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_00_01, pct_time_betw_00_01, index_1200F),
aes(x = 60 * 72, y = 0.0 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_00_01, "m, ",
mypercent(pct_time_betw_00_01))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='yellow3'
) +
# BETW 01 02
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_01_02, pct_time_betw_01_02, index_1200F),
aes(x = 60 * 72, y = .015 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_01_02, "m, ",
mypercent(pct_time_betw_01_02))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green'
) +
# BETW 02 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_02_03, pct_time_betw_02_03, index_1200F),
aes(x = 60 * 72, y = .025 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_02_03, "m, ",
mypercent(pct_time_betw_02_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green3'
) +
# GREATER 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_greater_03, pct_time_greater_03, index_1200F),
aes(x = 60 * 72, y = 0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_greater_03, "m, ",
mypercent(pct_time_greater_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
) +
facet_wrap(~LOTNO)
Distribution check of new features shows generally skewed and heavy-tailed data.
# pivot data and plot histograms
press_summ %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_at_pos:lot_yield) %>%
pivot_longer(cols = is.numeric) %>%
mutate_if(is.character, factor) %>%
ggplot(aes(x=value))+
geom_freqpoly(bins=15)+
facet_wrap(~name, scales='free')
Mean pressure while temperature is less than 1200F is much higher for kilns G and H.
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(mean_press,kiln) %>%
ggplot(aes(x=mean_press))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_less_00,kiln) %>%
ggplot(aes(x=time_less_00))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_betw_00_01,kiln) %>%
ggplot(aes(x=time_betw_00_01))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
Kilns G (especially) and H have far more time between .01 and .02 pressures.
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_betw_01_02,kiln) %>%
ggplot(aes(x=time_betw_01_02))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
Kilns G (especially) and H have far more time between .02 and .03 pressures.
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_betw_02_03,kiln) %>%
ggplot(aes(x=time_betw_02_03))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_greater_03,kiln) %>%
ggplot(aes(x=time_greater_03))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
Quick check of linear correlation of lot yields versus all features (including temps).
Only kiln E exhibits significant linear correlations (p > 0.05) with any pressure features.
No significant correlations.
M <- press_summ %>%
dplyr::filter(kiln == "C") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
M <- M[-9,-9]
p.mat <- cor.mtest(M)
# corrplot(as.matrix(M[8,]), method="number",
# p.mat = as.matrix(p.mat[8,])
# , sig.level = 0.05)
corrplot(as.matrix(M[14,]), method="number",
p.mat = as.matrix(p.mat[14,]), sig.level = 0.05)
Many Weak negative correlations, especially with time_betw_01_02, time_betw_02_03, and aucDiff.
M <- press_summ %>%
dplyr::filter(kiln == "D") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method="number",
p.mat = as.matrix(p.mat[15])
, sig.level = 0.05)
g1 <- press_summ %>%
dplyr::filter(kiln == "D") %>%
ggplot(aes(x=time_betw_01_02, y=lot_yield))+
geom_point()+
geom_smooth(method="lm")
g2 <- press_summ %>%
dplyr::filter(kiln == "D") %>%
ggplot(aes(x=time_betw_02_03, y=lot_yield))+
geom_point()+
geom_smooth(method="lm")
g3 <- press_summ %>%
dplyr::filter(kiln == "D") %>%
ggplot(aes(x=aucDiff, y=lot_yield))+
geom_point()+
geom_smooth(method="lm")
grid.arrange(g1,g2,g3,nrow=1)
Strong negative correlations with sd_press, time_greater_03, and time_betw_02_03. As these values increase, lot yields decrease.
M <- press_summ %>%
dplyr::filter(kiln == "E") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
p.mat = as.matrix(p.mat[15,])
, sig.level = 0.05)
g1 <- press_summ %>%
dplyr::filter(kiln == "E") %>%
ggplot(aes(x=sd_press, y=lot_yield))+
geom_point()+
geom_smooth(method='lm')
g2 <- press_summ %>%
dplyr::filter(kiln == "E") %>%
ggplot(aes(x=time_greater_03, y=lot_yield,group=time_greater_03))+
geom_point()+
geom_smooth(method='lm')
grid.arrange(g1,g2,nrow=1)
No significant correlations.
M <- press_summ %>%
dplyr::filter(kiln == "F") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
p.mat = as.matrix(p.mat[15,])
, sig.level = 0.05)
No significant correlations.
M <- press_summ %>%
dplyr::filter(kiln == "G") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
p.mat = as.matrix(p.mat[15,])
, sig.level = 0.05)
No significant correlations.
M <- press_summ %>%
dplyr::filter(kiln == "H") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
p.mat = as.matrix(p.mat[15,])
, sig.level = 0.05)
\(R^2\) values in general are very poor with the exception of kiln E. Though most of these values are inflated due to training on the entire dataset rather than splitting into test and train sets.
# linear model on each kiln
df <- press_summ %>%
dplyr::select(-LOTNO,-time_at_pos,-time_at_neg, -temp_avg)
set.seed(234)
df_split <- initial_split(df)
df_train <- training(df_split)
df_test <- testing(df_split)
lms <- df %>%
nest(data = c(-kiln)) %>%
mutate(fit = map(data, ~lm(lot_yield ~ ., data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
# glanced
lm_glance <- lms %>%
unnest(glanced) %>%
select(-data,-fit,-tidied,-augmented) %>%
select(kiln, r.squared,AIC,BIC) %>%
arrange(-r.squared) %>%
mutate_if(is.numeric, round, 5)
kable(lm_glance %>%
left_join(count(press_summ$kiln) %>%
mutate(kiln = factor(x)) %>%
select(kiln, freq)) %>%
select(kiln, freq, everything()),
format="html") %>%
kable_styling(full_width=T)
| kiln | freq | r.squared | AIC | BIC |
|---|---|---|---|---|
| E | 15 | 0.99846 | -89.84502 | -79.22427 |
| D | 87 | 0.23664 | -217.41242 | -180.42380 |
| C | 53 | 0.21330 | -128.16283 | -100.57875 |
| G | 232 | 0.09174 | -490.19878 | -438.49772 |
| H | 182 | 0.08479 | -303.27661 | -255.21651 |
| F | 117 | 0.03784 | -177.39467 | -135.96207 |
# tidied
lm_tidy <- lms %>%
unnest(tidied) %>%
select(-data,-fit,-glanced,-augmented) %>%
arrange(kiln, p.value) %>%
mutate_if(is.numeric, round, 5) %>%
mutate(
p.value = ifelse(p.value < .05,
cell_spec(p.value,"html",color="red"),
cell_spec(p.value,"html",color="black"))
)
kable(lm_tidy %>% dplyr::filter(kiln == "C"), format="html",escape = F) %>%
pack_rows( index=c("C" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| C | |||||
| C | (Intercept) | 0.85120 | 0.09854 | 8.63805 | 0 |
| C | sd_press | 11.56149 | 11.55899 | 1.00022 | 0.32322 |
| C | temp_max | 0.00106 | 0.00111 | 0.95615 | 0.34474 |
| C | aucDiff | 0.00000 | 0.00000 | -0.83611 | 0.40806 |
| C | mean_press | -8.00199 | 9.73070 | -0.82235 | 0.41576 |
| C | precip | -0.03041 | 0.04382 | -0.69409 | 0.49164 |
| C | time_less_00 | -0.00003 | 0.00006 | -0.59942 | 0.55227 |
| C | time_betw_00_01 | 0.00003 | 0.00006 | 0.51597 | 0.60871 |
| C | temp_min | 0.00060 | 0.00128 | 0.46870 | 0.64183 |
| C | snow_fall | -0.00245 | 0.01420 | -0.17259 | 0.86385 |
| C | snow_depth | -0.00078 | 0.00453 | -0.17238 | 0.86401 |
| C | time_betw_01_02 | 0.00026 | 0.00244 | 0.10564 | 0.9164 |
| C | time_betw_02_03 | -0.00005 | 0.00717 | -0.00764 | 0.99394 |
| C | time_greater_03 | NA | NA | NA | NA |
kable(lm_tidy %>% dplyr::filter(kiln == "D"), format="html",escape = F) %>%
pack_rows( index=c("D" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| D | |||||
| D | (Intercept) | 0.91481 | 0.05624 | 16.26736 | 0 |
| D | time_betw_02_03 | -0.05878 | 0.02283 | -2.57484 | 0.01205 |
| D | aucDiff | 0.00000 | 0.00000 | -2.18340 | 0.03222 |
| D | snow_fall | 0.02275 | 0.01551 | 1.46668 | 0.14676 |
| D | time_less_00 | 0.00007 | 0.00006 | 1.35300 | 0.18023 |
| D | time_betw_01_02 | -0.00128 | 0.00117 | -1.08975 | 0.27941 |
| D | snow_depth | -0.00339 | 0.00344 | -0.98540 | 0.32768 |
| D | time_greater_03 | 0.01171 | 0.01829 | 0.64028 | 0.524 |
| D | sd_press | -3.84453 | 7.07149 | -0.54367 | 0.58833 |
| D | time_betw_00_01 | 0.00004 | 0.00009 | 0.48635 | 0.62817 |
| D | mean_press | 4.34886 | 14.00702 | 0.31048 | 0.75708 |
| D | temp_min | -0.00027 | 0.00105 | -0.25494 | 0.79948 |
| D | precip | 0.00619 | 0.02769 | 0.22368 | 0.82363 |
| D | temp_max | 0.00008 | 0.00095 | 0.07914 | 0.93714 |
kable(lm_tidy %>% dplyr::filter(kiln == "E"), format="html",escape = F) %>%
pack_rows( index=c("E" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| E | |||||
| E | time_betw_01_02 | 0.00182 | 0.00018 | 10.32889 | 0.06144 |
| E | mean_press | -108.85279 | 14.60849 | -7.45134 | 0.08493 |
| E | time_betw_00_01 | 0.00078 | 0.00014 | 5.41257 | 0.11631 |
| E | temp_min | 0.00823 | 0.00185 | 4.45509 | 0.14057 |
| E | temp_max | -0.00621 | 0.00145 | -4.27460 | 0.1463 |
| E | (Intercept) | 0.58429 | 0.20431 | 2.85989 | 0.21414 |
| E | snow_depth | 0.08660 | 0.03851 | 2.24871 | 0.26639 |
| E | snow_fall | -0.18387 | 0.09017 | -2.03903 | 0.29027 |
| E | time_less_00 | 0.00026 | 0.00013 | 2.00809 | 0.29414 |
| E | time_greater_03 | -0.06790 | 0.03802 | -1.78606 | 0.32493 |
| E | aucDiff | 0.00000 | 0.00000 | -0.93358 | 0.52186 |
| E | precip | -0.17567 | 0.21740 | -0.80802 | 0.56735 |
| E | sd_press | -18.66761 | 32.50378 | -0.57432 | 0.66811 |
| E | time_betw_02_03 | 0.00242 | 0.02054 | 0.11797 | 0.92525 |
kable(lm_tidy %>% dplyr::filter(kiln == "F"), format="html",escape = F) %>%
pack_rows( index=c("G" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| G | |||||
| F | (Intercept) | 0.88886 | 0.09748 | 9.11789 | 0 |
| F | temp_max | 0.00170 | 0.00129 | 1.32252 | 0.18892 |
| F | temp_min | -0.00172 | 0.00140 | -1.22340 | 0.22397 |
| F | aucDiff | 0.00000 | 0.00000 | 0.75490 | 0.45203 |
| F | time_betw_01_02 | 0.00028 | 0.00046 | 0.61192 | 0.54194 |
| F | sd_press | 2.99793 | 7.30504 | 0.41039 | 0.68237 |
| F | time_greater_03 | 0.00088 | 0.00237 | 0.37248 | 0.7103 |
| F | time_less_00 | -0.00003 | 0.00008 | -0.37204 | 0.71063 |
| F | time_betw_00_01 | -0.00003 | 0.00008 | -0.37046 | 0.7118 |
| F | snow_depth | -0.00137 | 0.00484 | -0.28324 | 0.77756 |
| F | snow_fall | -0.00484 | 0.01971 | -0.24561 | 0.80647 |
| F | precip | -0.00835 | 0.04027 | -0.20746 | 0.83606 |
| F | time_betw_02_03 | -0.00017 | 0.00186 | -0.09120 | 0.92751 |
| F | mean_press | 0.26109 | 11.37128 | 0.02296 | 0.98173 |
kable(lm_tidy %>% dplyr::filter(kiln == "G"), format="html",escape = F) %>%
pack_rows( index=c("G" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| G | |||||
| G | (Intercept) | 0.69377 | 0.07871 | 8.81458 | 0 |
| G | time_greater_03 | -0.00060 | 0.00019 | -3.08980 | 0.00226 |
| G | time_betw_00_01 | 0.00013 | 0.00005 | 2.65998 | 0.0084 |
| G | mean_press | 14.07606 | 5.94712 | 2.36687 | 0.01881 |
| G | time_less_00 | 0.00016 | 0.00009 | 1.90628 | 0.05793 |
| G | sd_press | 6.07222 | 3.25883 | 1.86331 | 0.06376 |
| G | aucDiff | 0.00001 | 0.00000 | 1.24822 | 0.21329 |
| G | time_betw_01_02 | -0.00006 | 0.00006 | -0.95087 | 0.34273 |
| G | snow_fall | -0.00502 | 0.00590 | -0.85106 | 0.39567 |
| G | temp_max | -0.00051 | 0.00066 | -0.77430 | 0.43959 |
| G | time_betw_02_03 | -0.00003 | 0.00010 | -0.31542 | 0.75275 |
| G | snow_depth | 0.00081 | 0.00303 | 0.26627 | 0.79028 |
| G | temp_min | -0.00003 | 0.00075 | -0.04422 | 0.96477 |
| G | precip | -0.00067 | 0.02223 | -0.03026 | 0.97589 |
kable(lm_tidy %>% dplyr::filter(kiln == "H"), format="html",escape = F) %>%
pack_rows( index=c("H" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| H | |||||
| H | (Intercept) | 0.88829 | 0.14374 | 6.17995 | 0 |
| H | aucDiff | 0.00000 | 0.00000 | -2.91740 | 0.00401 |
| H | time_betw_01_02 | 0.00022 | 0.00014 | 1.63055 | 0.10486 |
| H | temp_max | -0.00111 | 0.00102 | -1.09009 | 0.27723 |
| H | time_betw_00_01 | 0.00011 | 0.00012 | 0.93691 | 0.35015 |
| H | mean_press | -7.49653 | 8.88233 | -0.84398 | 0.39988 |
| H | time_betw_02_03 | 0.00026 | 0.00031 | 0.83270 | 0.4062 |
| H | precip | 0.01484 | 0.02994 | 0.49574 | 0.62073 |
| H | time_less_00 | 0.00006 | 0.00016 | 0.36109 | 0.71848 |
| H | snow_fall | 0.00339 | 0.00948 | 0.35727 | 0.72134 |
| H | sd_press | -2.37194 | 7.45432 | -0.31820 | 0.75073 |
| H | snow_depth | -0.00109 | 0.00398 | -0.27476 | 0.78384 |
| H | time_greater_03 | 0.00011 | 0.00065 | 0.16109 | 0.87222 |
| H | temp_min | 0.00015 | 0.00114 | 0.12964 | 0.89701 |
Predicted values versus actual values.
# augmented
lms %>% unnest(augmented) %>%
select(-data,-fit,-tidied,-glanced) %>%
ggplot(aes(x=lot_yield, y=.fitted))+
geom_pointdensity(show.legend = FALSE)+
# geom_point()+
geom_abline(lty=2,color='red')+
facet_wrap(~kiln)+
scale_y_continuous(limits=c(0.6,1))+
scale_x_continuous(limits=c(0.6,1))+
scale_color_viridis_c()
To get an idea of which variables are most important we can use LASSO regression to remove the least important variables. We can also test with a random forest variable importance implementation for comparison sake.
Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.
kable(lm_tidy %>% dplyr::filter(kiln == "G"),
format="html",
caption = paste0("Significant variables by p-value, Model R^2: ",
round(lm_glance$r.squared[[4]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln G" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln G | |||||
| G | (Intercept) | 0.69377 | 0.07871 | 8.81458 | 0 |
| G | time_greater_03 | -0.00060 | 0.00019 | -3.08980 | 0.00226 |
| G | time_betw_00_01 | 0.00013 | 0.00005 | 2.65998 | 0.0084 |
| G | mean_press | 14.07606 | 5.94712 | 2.36687 | 0.01881 |
| G | time_less_00 | 0.00016 | 0.00009 | 1.90628 | 0.05793 |
| G | sd_press | 6.07222 | 3.25883 | 1.86331 | 0.06376 |
| G | aucDiff | 0.00001 | 0.00000 | 1.24822 | 0.21329 |
| G | time_betw_01_02 | -0.00006 | 0.00006 | -0.95087 | 0.34273 |
| G | snow_fall | -0.00502 | 0.00590 | -0.85106 | 0.39567 |
| G | temp_max | -0.00051 | 0.00066 | -0.77430 | 0.43959 |
| G | time_betw_02_03 | -0.00003 | 0.00010 | -0.31542 | 0.75275 |
| G | snow_depth | 0.00081 | 0.00303 | 0.26627 | 0.79028 |
| G | temp_min | -0.00003 | 0.00075 | -0.04422 | 0.96477 |
| G | precip | -0.00067 | 0.02223 | -0.03026 | 0.97589 |
LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.
# original data
df <- press_summ %>%
dplyr::filter(kiln == "G") %>%
# dplyr::select(mean_press:lot_yield, -kiln)
dplyr::select(precip:lot_yield, -kiln)
# splits
set.seed(323)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
step_normalize(all_numeric(), -all_outcomes()) %>%
prep()
# lasso workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# lasso model specification
lasso_spec <- linear_reg(penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# generate folds for tuning
set.seed(1434)
lasso_folds <- vfold_cv(pressure_train)
# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)
# tune model
set.seed(211)
lasso_grid <- tune_grid(
wf %>% add_model(lasso_spec),
resamples = lasso_folds,
grid = lambda_grid
)
# visualize tuning results
# lasso_grid %>% collect_metrics() %>%
# ggplot(aes(penalty, mean, color=.metric))+
# geom_errorbar(aes(ymin = mean - std_err,
# ymax = mean + std_err), alpha = .4)+
# geom_point(size=2)+
# geom_line(size=1.5)+
# scale_x_log10()+
# facet_wrap(~.metric,nrow=2,scales="free")
# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")
# finalize model wf
final_lasso_wf <- finalize_workflow(
wf %>% add_model(lasso_spec),
best_rsq
# best_rmse
)
# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>%
fit(df) %>%
# fit(pressure_train) %>%
pull_workflow_fit() %>%
vi(lambda = best_rsq$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
labs(y=NULL)+
scale_x_continuous(expand = c(0,0))+
ggtitle(paste0("Variable importance using LASSO"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_lasso_importance
# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
prep()
# rf model specification
rf_spec <- rand_forest(mode="regression",
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_engine("randomForest")
# workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)
# tune on non-specific grid
tune_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = 20
)
# visualize first tuning results
# tune_res %>% collect_metrics() %>%
# filter(.metric == "rsq") %>%
# select(mean, min_n, mtry) %>%
# pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
# ggplot(aes(value,mean,color=parameter))+
# geom_point()+
# geom_line()+
# facet_wrap(~parameter, nrow=2, scales="free")
# generate more specific grid
rf_grid <- grid_regular(
mtry(range = c(8,15)),
min_n(range = c(15,35)),
levels = 6
)
# tune on more specific grid
regular_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = rf_grid
)
# visualize second tuning results
# regular_res %>% collect_metrics() %>%
# filter(.metric == "rsq") %>%
# mutate(min_n = as.factor(min_n)) %>%
# ggplot(aes(mtry, mean, color = min_n))+
# geom_point()+
# geom_line(size=1.2)
# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")
# finalize model
final_rf <- finalize_model(
rf_spec,
best_rsq
)
# variable importance on training
# final_rf %>%
# set_engine("randomForest") %>%
# fit(lot_yield ~ ., juice(pressure_rec)) %>%
# vip(geom = "point")
# final random forest workflow
final_rf_wf <- workflow() %>%
add_recipe(pressure_rec) %>%
add_model(final_rf)
# fit model on test set
final_rf_res <- final_rf_wf %>%
last_fit(pressure_split)
# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>%
# ggplot(aes(.pred, lot_yield))+
# geom_point()+geom_abline(lty=2,color='red')
# final model
# final_rf_model <- fit(final_rf_wf, df)
# variable importance on all data
gg_rf_importance <- final_rf %>%
set_engine("randomForest") %>%
fit(lot_yield ~ ., df) %>%
vip(geom = "point")+
ggtitle(paste0("Variable importance using RandomForest"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_rf_importance
Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.
kable(lm_tidy %>% dplyr::filter(kiln == "H"),
format="html",
caption = paste0("Significant variables by p-value, Model R^2: ",
round(lm_glance$r.squared[[5]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln H" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln H | |||||
| H | (Intercept) | 0.88829 | 0.14374 | 6.17995 | 0 |
| H | aucDiff | 0.00000 | 0.00000 | -2.91740 | 0.00401 |
| H | time_betw_01_02 | 0.00022 | 0.00014 | 1.63055 | 0.10486 |
| H | temp_max | -0.00111 | 0.00102 | -1.09009 | 0.27723 |
| H | time_betw_00_01 | 0.00011 | 0.00012 | 0.93691 | 0.35015 |
| H | mean_press | -7.49653 | 8.88233 | -0.84398 | 0.39988 |
| H | time_betw_02_03 | 0.00026 | 0.00031 | 0.83270 | 0.4062 |
| H | precip | 0.01484 | 0.02994 | 0.49574 | 0.62073 |
| H | time_less_00 | 0.00006 | 0.00016 | 0.36109 | 0.71848 |
| H | snow_fall | 0.00339 | 0.00948 | 0.35727 | 0.72134 |
| H | sd_press | -2.37194 | 7.45432 | -0.31820 | 0.75073 |
| H | snow_depth | -0.00109 | 0.00398 | -0.27476 | 0.78384 |
| H | time_greater_03 | 0.00011 | 0.00065 | 0.16109 | 0.87222 |
| H | temp_min | 0.00015 | 0.00114 | 0.12964 | 0.89701 |
LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.
# original data
df <- press_summ %>%
dplyr::filter(kiln == "H") %>%
# dplyr::select(mean_press:lot_yield, -kiln)
dplyr::select(precip:lot_yield, -kiln)
# splits
set.seed(32)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
step_normalize(all_numeric(), -all_outcomes()) %>%
prep()
# lasso workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# lasso model specification
lasso_spec <- linear_reg(penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# generate folds for tuning
set.seed(144)
lasso_folds <- vfold_cv(pressure_train)
# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)
# tune model
set.seed(211)
lasso_grid <- tune_grid(
wf %>% add_model(lasso_spec),
resamples = lasso_folds,
grid = lambda_grid
)
# visualize tuning results
# lasso_grid %>% collect_metrics() %>%
# ggplot(aes(penalty, mean, color=.metric))+
# geom_errorbar(aes(ymin = mean - std_err,
# ymax = mean + std_err), alpha = .4)+
# geom_point(size=2)+
# geom_line(size=1.5)+
# scale_x_log10()+
# facet_wrap(~.metric,nrow=2,scales="free")
# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")
# finalize model wf
final_lasso_wf <- finalize_workflow(
wf %>% add_model(lasso_spec),
best_rsq
# best_rmse
)
# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>%
fit(df) %>%
# fit(pressure_train) %>%
pull_workflow_fit() %>%
vi(lambda = best_rsq$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
labs(y=NULL)+
scale_x_continuous(expand = c(0,0))+
ggtitle(paste0("Variable importance using LASSO"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_lasso_importance
# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
prep()
# rf model specification
rf_spec <- rand_forest(mode="regression",
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_engine("randomForest")
# workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)
# tune on non-specific grid
tune_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = 20
)
# visualize first tuning results
# tune_res %>% collect_metrics() %>%
# filter(.metric == "rsq") %>%
# select(mean, min_n, mtry) %>%
# pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
# ggplot(aes(value,mean,color=parameter))+
# geom_point()+
# geom_line()+
# facet_wrap(~parameter, nrow=2, scales="free")
# generate more specific grid
rf_grid <- grid_regular(
mtry(range = c(1,5)),
min_n(range = c(10,30)),
levels = 7
)
# tune on more specific grid
regular_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = rf_grid
)
# visualize second tuning results
# regular_res %>% collect_metrics() %>%
# filter(.metric == "rsq") %>%
# mutate(min_n = as.factor(min_n)) %>%
# ggplot(aes(mtry, mean, color = min_n))+
# geom_point()+
# geom_line(size=1.2)
# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")
# finalize model
final_rf <- finalize_model(
rf_spec,
best_rsq
)
# variable importance on training
# final_rf %>%
# set_engine("randomForest") %>%
# fit(lot_yield ~ ., juice(pressure_rec)) %>%
# vip(geom = "point")
# final random forest workflow
final_rf_wf <- workflow() %>%
add_recipe(pressure_rec) %>%
add_model(final_rf)
# fit model on test set
final_rf_res <- final_rf_wf %>%
last_fit(pressure_split)
# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>%
# ggplot(aes(.pred, lot_yield))+
# geom_point()+geom_abline(lty=2,color='red')
# final model
# final_rf_model <- fit(final_rf_wf, df)
# variable importance on all data
gg_rf_importance <- final_rf %>%
set_engine("randomForest") %>%
fit(lot_yield ~ ., df) %>%
vip(geom = "point")+
ggtitle(paste0("Variable importance using RandomForest"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_rf_importance
Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.
kable(lm_tidy %>% dplyr::filter(kiln == "D"),
format="html",
caption = paste0("Significant variables by p-value, Model R^2: ",
round(lm_glance$r.squared[[2]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln D" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln D | |||||
| D | (Intercept) | 0.91481 | 0.05624 | 16.26736 | 0 |
| D | time_betw_02_03 | -0.05878 | 0.02283 | -2.57484 | 0.01205 |
| D | aucDiff | 0.00000 | 0.00000 | -2.18340 | 0.03222 |
| D | snow_fall | 0.02275 | 0.01551 | 1.46668 | 0.14676 |
| D | time_less_00 | 0.00007 | 0.00006 | 1.35300 | 0.18023 |
| D | time_betw_01_02 | -0.00128 | 0.00117 | -1.08975 | 0.27941 |
| D | snow_depth | -0.00339 | 0.00344 | -0.98540 | 0.32768 |
| D | time_greater_03 | 0.01171 | 0.01829 | 0.64028 | 0.524 |
| D | sd_press | -3.84453 | 7.07149 | -0.54367 | 0.58833 |
| D | time_betw_00_01 | 0.00004 | 0.00009 | 0.48635 | 0.62817 |
| D | mean_press | 4.34886 | 14.00702 | 0.31048 | 0.75708 |
| D | temp_min | -0.00027 | 0.00105 | -0.25494 | 0.79948 |
| D | precip | 0.00619 | 0.02769 | 0.22368 | 0.82363 |
| D | temp_max | 0.00008 | 0.00095 | 0.07914 | 0.93714 |
LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.
# original data
df <- press_summ %>%
dplyr::filter(kiln == "D") %>%
# dplyr::select(mean_press:lot_yield, -kiln)
dplyr::select(precip:lot_yield, -kiln)
# splits
set.seed(32)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
step_normalize(all_numeric(), -all_outcomes()) %>%
prep()
# lasso workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# lasso model specification
lasso_spec <- linear_reg(penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# generate folds for tuning
set.seed(144)
lasso_folds <- vfold_cv(pressure_train)
# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)
# tune model
set.seed(211)
lasso_grid <- tune_grid(
wf %>% add_model(lasso_spec),
resamples = lasso_folds,
grid = lambda_grid
)
# visualize tuning results
# lasso_grid %>% collect_metrics() %>%
# ggplot(aes(penalty, mean, color=.metric))+
# geom_errorbar(aes(ymin = mean - std_err,
# ymax = mean + std_err), alpha = .4)+
# geom_point(size=2)+
# geom_line(size=1.5)+
# scale_x_log10()+
# facet_wrap(~.metric,nrow=2,scales="free")
# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")
# finalize model wf
final_lasso_wf <- finalize_workflow(
wf %>% add_model(lasso_spec),
best_rsq
# best_rmse
)
# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>%
fit(df) %>%
# fit(pressure_train) %>%
pull_workflow_fit() %>%
vi(lambda = best_rsq$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
labs(y=NULL)+
scale_x_continuous(expand = c(0,0))+
ggtitle(paste0("Variable importance using LASSO"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_lasso_importance
# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
prep()
# rf model specification
rf_spec <- rand_forest(mode="regression",
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_engine("randomForest")
# workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)
# tune on non-specific grid
tune_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = 20
)
# visualize first tuning results
# tune_res %>% collect_metrics() %>%
# filter(.metric == "rsq") %>%
# select(mean, min_n, mtry) %>%
# pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
# ggplot(aes(value,mean,color=parameter))+
# geom_point()+
# geom_line()+
# facet_wrap(~parameter, nrow=2, scales="free")
# generate more specific grid
rf_grid <- grid_regular(
min_n(range = c(1,20)),
mtry(range = c(10,16)),
levels = 6
)
# tune on more specific grid
regular_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = rf_grid
)
# visualize second tuning results
# regular_res %>% collect_metrics() %>%
# filter(.metric == "rsq") %>%
# mutate(min_n = as.factor(min_n)) %>%
# ggplot(aes(mtry, mean, color = min_n))+
# geom_point()+
# geom_line(size=1.2)
# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")
# finalize model
final_rf <- finalize_model(
rf_spec,
best_rsq
)
# variable importance on training
# final_rf %>%
# set_engine("randomForest") %>%
# fit(lot_yield ~ ., juice(pressure_rec)) %>%
# vip(geom = "point")
# final random forest workflow
final_rf_wf <- workflow() %>%
add_recipe(pressure_rec) %>%
add_model(final_rf)
# fit model on test set
final_rf_res <- final_rf_wf %>%
last_fit(pressure_split)
# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>%
# ggplot(aes(.pred, lot_yield))+
# geom_point()+geom_abline(lty=2,color='red')
# final model
# final_rf_model <- fit(final_rf_wf, df)
# variable importance on all data
gg_rf_importance <- final_rf %>%
set_engine("randomForest") %>%
fit(lot_yield ~ ., df) %>%
vip(geom = "point")+
ggtitle(paste0("Variable importance using RandomForest"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_rf_importance
Rather than segregate into kilns, we group by item.
press_summ
## # A tibble: 686 x 19
## LOTNO precip snow_fall snow_depth temp_max temp_min temp_avg time_at_pos
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0102~ 0.02 0 0 37 22 29.5 132
## 2 0103~ 0.08 0.2 0 35 23 29 1144
## 3 0103~ 0 0 0 42 26 34 602
## 4 0103~ 0 0 0 42 26 34 1021
## 5 0104~ 0 0 4 27 -4 11.5 545
## 6 0104~ 0 0 4 27 -4 11.5 307
## 7 0104~ 0 0 4 27 -4 11.5 991
## 8 0104~ 0 0 0 30 26 28 154
## 9 0104~ 0 0 0 30 26 28 999
## 10 0104~ 0.16 0 0 45 34 39.5 1204
## # ... with 676 more rows, and 11 more variables: time_at_neg <dbl>,
## # mean_press <dbl>, sd_press <dbl>, time_greater_03 <dbl>,
## # time_betw_02_03 <dbl>, time_betw_01_02 <dbl>, time_betw_00_01 <dbl>,
## # time_less_00 <dbl>, aucDiff <dbl>, kiln <fct>, lot_yield <dbl>
df_merged
## # A tibble: 55,847 x 34
## FIRE_DATE month year year_month LOTNO KILN ITEM TYPE EC PPI
## <date> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 2016-01-02 Jan 2016 2016-Jan 0102~ B 854159 HF None 10
## 2 2016-01-02 Jan 2016 2016-Jan 0102~ B 852337 HF None 10
## 3 2016-01-02 Jan 2016 2016-Jan 0102~ G 853722 HF FEC 10
## 4 2016-01-02 Jan 2016 2016-Jan 0102~ G 853753 HF FEC 10
## 5 2016-01-02 Jan 2016 2016-Jan 0102~ G 853762 HF FEC 15
## 6 2016-01-02 Jan 2016 2016-Jan 0102~ G 853765 HF None 30
## 7 2016-01-02 Jan 2016 2016-Jan 0102~ G 853765 HF None 30
## 8 2016-01-02 Jan 2016 2016-Jan 0102~ G 853765 HF None 30
## 9 2016-01-02 Jan 2016 2016-Jan 0102~ G 853798 HF SEC 30
## 10 2016-01-02 Jan 2016 2016-Jan 0102~ G 853798 HF SEC 30
## # ... with 55,837 more rows, and 24 more variables: COMPOSITION <chr>,
## # DESCRIPTION <chr>, CAUSE <chr>, reject_vol_single_row_D <dbl>,
## # reject_cost_single_row_D <dbl>, TOTAL_ITEM_FIRED_Y <dbl>,
## # total_item_count_fired_D <dbl>, TOTAL_ITEM_REJECTED_Y <dbl>,
## # total_item_count_rejected_D <dbl>, total_item_pct_yield_Y <dbl>,
## # total_item_cost_of_rejects_D <dbl>, total_item_fired_vol_D <dbl>,
## # total_item_vol_fired_Y <dbl>, total_item_rejected_vol_D <dbl>,
## # total_item_vol_rejected_Y <dbl>, cost_piece <dbl>, vol_piece <dbl>,
## # COST_IN3 <dbl>, precip <dbl>, snow_fall <dbl>, snow_depth <dbl>,
## # temp_max <dbl>, temp_min <dbl>, temp_avg <dbl>
# df yields join with press summ ... do pressures impact yields of items?
# df merged/defect join with press summ ... do pressures impact on occurence of defects?